/*

 Program prepareimsshh.do selects similar samples from the IMSS and
 ENEU data and harmonizes variables between
 them, to get ready to run regressions.

 Kumler, Verhoogen, Frias "Enlisting Employees ..." REStat forthcoming

*/



***************** housekeeping ******************;

#delimit;
set more off;

do ${code}housekeeping.do;

tempfile d1 d2;


*********************************************;
************ auxiliary datasets *************;
*********************************************;

* Stage the two contribution-rate tables as temporary files sorted by
  year and quarter, ready for the m:1 merges further down.
  Tempfile d1 receives the employer table and d2 the worker table;

local slot = 0;

foreach payer in employer worker
{;

 local ++slot;

 clear all;
 use ${work}imss_`payer'_contrib;
 sort year qtr;
 save `d`slot'', replace;

};


*********************************************;
************ prepare IMSS data **************;
*********************************************;

* For every year-quarter file this loop stacks the public-sector and
  non-public IMSS worker files, applies the sample restrictions used for
  the ENEU, builds harmonized wage, firm-size, sector and age variables,
  attaches employer and worker contribution rates, restricts to ENEU
  metro areas from 1987 onward, and saves two files per year-month
  (with and without sector 4 workers);

* loop over years 1985-2004;
* loop over quarters (months 3, 6, 9, 12);

* one scratch file is enough -- it is overwritten on every iteration;
tempfile pubsec;

forval yr = 1985/2004
{;

 forval month = 3(3)12
 {;

  di "Year = `yr', Month = `month'";

  ** get public sector workers file ready;

  use nss sal year qtr male main_job age mpio_imss frac rsal1 rsal2 rsal3 rsal4 rsal5 rsal6 salmin
      rsalmin empl_imss firmsize salmindf entmpio_inegi ind cpi registro rama_imss
      using ${work}indlevelpubsec`yr'`month', clear;

  save `pubsec', replace;

  ** load non-public worker data and append public worker data;

  use nss sal year qtr male main_job age mpio_imss frac rsal1 rsal2 rsal3 rsal4 rsal5 rsal6 salmin
      rsalmin empl_imss firmsize salmindf entmpio_inegi ind cpi registro rama_imss
      using ${work}indlevel`yr'`month', clear;

  append using `pubsec';

  * Note: The ENEU only asks about the main job, so to remain consistent focus on main job here;
  tab main_job, missing;
  keep if main_job==1;
  drop main_job;

  * keep workers between ages 16-65 inclusive -- those are the cutoffs we used in the ENEU;

  keep if age>=16 & age<=65;

  * drop if sex is missing;
  keep if male==0 | male==1;

  * calculate real value of min wage in DF -- to be used below;
  gen rsalmindf = salmindf/(cpi/100);

  * impose a lower bound of 30 pesos (real terms) on reported wages (only applies pre-1991);

  keep if rsal1 >= 30;

  * create wage variable that has minimum topcode over period, but bottomcode same as rsal1;

  * find topcode -- taken as the largest value of rsal4 in this year-quarter file;
  sum rsal4;
  local topcode=r(max);

  * Stata treats missing as larger than any number, so each topcode
    replace below excludes missing wages explicitly. Otherwise a missing
    wage would silently be turned into the topcode;
  gen rsal7 = rsal1;
  replace rsal7 = `topcode' if rsal7>`topcode' & !missing(rsal7);

  * create wage variable that has bottomcode 1.045*MW and minimum topcode over period;

  gen rsal8 = rsal1;
  replace rsal8 = `topcode' if rsal8>`topcode' & !missing(rsal8);
  replace rsal8 = 1.045*rsalmin if rsal8<1.045*rsalmin;

  * create wage variable that has bottomcode of 30 pesos and minimum topcode over period;
  * the conditions test rsal9 itself. Testing the already-capped rsal8
    (as an earlier version did) meant rsal9 was never topcoded;

  gen rsal9 = rsal1;
  replace rsal9 = `topcode' if rsal9>`topcode' & !missing(rsal9);
  replace rsal9 = 30 if rsal9<30;

  * generate entidad variables -- first two characters of the INEGI state-municipality code;

  gen ent_imss=substr(entmpio_inegi,1,2);
  assert ent_imss~="", r;

  * create indicators for wage near minimum wage -- to get rough measure of stacking;

  gen byte imss_lt105mw = 0;
  replace imss_lt105mw = 1 if rsal1<1.05*rsalmin;
  gen byte imss_lt110mw = 0;
  replace imss_lt110mw = 1 if rsal1<1.10*rsalmin;

  * create more aggregated firm size variables;
  * firmsize2 is 0 for firmsize codes 1-6 and 1 for codes 7-8;
  gen firmsize2 = 0 if inlist(firmsize, 1, 2, 3, 4, 5, 6);
  replace firmsize2 = 1 if inlist(firmsize,7,8);

  * firmsize3 collapses the eight firmsize codes into five groups;
  gen firmsize3 = 1 if inlist(firmsize, 1, 2, 3);
  replace firmsize3 = 2 if inlist(firmsize, 4, 5);
  replace firmsize3 = 3 if inlist(firmsize,6);
  replace firmsize3 = 4 if inlist(firmsize,7);
  replace firmsize3 = 5 if inlist(firmsize,8);

  label define fslabel 1 "1-10 emp." 2 "11-50 emp." 3 "51-100 emp."
     4 "101-250 emp." 5 ">250 emp.";
  label values firmsize3 fslabel;

  * create aggregated sector variable;
  * rama_imss and frac are strings, so the ranges below are string comparisons;
  gen sector = .;
  replace sector = 1 if inrange(rama_imss,"20","30") | inrange(rama_imss,"32","39");
  replace sector = 2 if inrange(rama_imss,"41","42");
  replace sector = 3 if inrange(rama_imss,"61","69");

  * applied last, so frac 9403 overrides any rama-based assignment;
  replace sector = 4 if frac=="9403";

  label define sectorlabel 1 "manufacturing" 2 "construction" 3 "services/retail" 4 "social security";
  label values sector sectorlabel;

  compress;

  * flag marking records that come from the IMSS data;
  gen byte imss=1;

  * make age categories -- ten 5-year bands, then five 10-year bands;

  tab age;

  gen byte age_cat=1 if age>=16 & age<=20;
  replace age_cat=2 if age>=21 & age<=25;
  replace age_cat=3 if age>=26 & age<=30;
  replace age_cat=4 if age>=31 & age<=35;
  replace age_cat=5 if age>=36 & age<=40;
  replace age_cat=6 if age>=41 & age<=45;
  replace age_cat=7 if age>=46 & age<=50;
  replace age_cat=8 if age>=51 & age<=55;
  replace age_cat=9 if age>=56 & age<=60;
  replace age_cat=10 if age>=61 & age<=65;

  gen byte age_cat2=.;
  replace age_cat2 = 1 if age_cat==1 | age_cat==2;
  replace age_cat2 = 2 if age_cat==3 | age_cat==4;
  replace age_cat2 = 3 if age_cat==5 | age_cat==6;
  replace age_cat2 = 4 if age_cat==7 | age_cat==8;
  replace age_cat2 = 5 if age_cat==9 | age_cat==10;

  *** merge in employer and worker contribution rates and calculate IMSS
    net wage;

  ** employer contributions;

  * assert(2 3) stops the run if any worker record has no matching
    year-quarter in the rates table. Unused rate rows are then discarded;
  sort year qtr;
  merge m:1 year qtr using `d1', assert(2 3);
  keep if _merge==3;
  drop _merge;

  * value of the employer contribution (originally calculated in contributions_imss.do);
  * each pair of lines applies one rate to the wage up to a ceiling:
    the rate is charged on rsal7 when the wage is at or below the
    ceiling, and on the ceiling itself otherwise. The extra rate is
    charged only on the part of the wage above 3 times rsalmindf, and
    sem_fija is charged on rsalmindf regardless of the wage;

  gen rimss_emp_contrib = (

                    (rsal7<=limit_sem*rsalmindf)*(sem/100)*rsal7 +
                    (rsal7>limit_sem*rsalmindf)*(sem/100)*limit_sem*rsalmindf +
                    (rsal7<=limit_sem*rsalmindf)*(sem_d/100)*rsal7 +
                    (rsal7>limit_sem*rsalmindf)*(sem_d/100)*limit_sem*rsalmindf +

                    (rsal7<=limit_siv*rsalmindf)*(sivcm/100)*rsal7 +
                    (rsal7>limit_siv*rsalmindf)*(sivcm/100)*limit_siv*rsalmindf +
                    (rsal7<=limit_siv*rsalmindf)*(siv/100)*rsal7 +
                    (rsal7>limit_siv*rsalmindf)*(siv/100)*limit_siv*rsalmindf +

                    (rsal7<=limit_srcv_r*rsalmindf)*(srcv_r/100)*rsal7 +
                    (rsal7>limit_srcv_r*rsalmindf)*(srcv_r/100)*limit_srcv_r*rsalmindf +

                    (rsal7<=limit_srcv_c*rsalmindf)*(srcv_c/100)*rsal7 +
                    (rsal7>limit_srcv_c*rsalmindf)*(srcv_c/100)*limit_srcv_c*rsalmindf +

                    (rsal7<=limit_sgps*rsalmindf)*(sgps/100)*rsal7 +
                    (rsal7>limit_sgps*rsalmindf)*(sgps/100)*limit_sgps*rsalmindf +

                    (rsal7<=limit_cpda*rsalmin)*(cpda/100)*rsal7 +
                    (rsal7>limit_cpda*rsalmin)*(cpda/100)*limit_cpda*rsalmin +

                    (rsal7<=limit_siv*rsalmindf)*(aport_esp/100)*rsal7 +
                    (rsal7>limit_siv*rsalmindf)*(aport_esp/100)*limit_siv*rsalmindf +

                    (rsal7<=limit_sem*rsalmindf)*(rsal7>=3*rsalmindf)*(extra/100)*(rsal7-3*rsalmindf) +
                    (rsal7>limit_sem*rsalmindf)*(rsal7>=3*rsalmindf)*(extra/100)
                          *(limit_sem*rsalmindf-3*rsalmindf) +

                    (sem_fija/100)*rsalmindf

                    );

  gen imss_emp_contrib_pct = (rimss_emp_contrib / rsal7)*100;

  * drop variables from contributions file -- the worker table reuses
    the same variable names, so they must be gone before the next merge;

  drop sem sem_d sivcm siv srcv_r srcv_c sgps aport_esp extra cpda
    sem_fija srt limit_siv limit_srcv_c limit_sem limit_srcv_r
    limit_sgps limit_srt limit_cpda;

  ** worker contributions;

  sort year qtr;
  merge m:1 year qtr using `d2', assert(2 3);
  keep if _merge==3;
  drop _merge;

  * value of the worker contribution (originally calculated in contributions_imss.do);
  * same formula as the employer contribution, now evaluated with the
    rates and ceilings merged in from the worker table;
  gen rimss_worker_contrib = (

                    (rsal7<=limit_sem*rsalmindf)*(sem/100)*rsal7 +
                    (rsal7>limit_sem*rsalmindf)*(sem/100)*limit_sem*rsalmindf +
                    (rsal7<=limit_sem*rsalmindf)*(sem_d/100)*rsal7 +
                    (rsal7>limit_sem*rsalmindf)*(sem_d/100)*limit_sem*rsalmindf +

                    (rsal7<=limit_siv*rsalmindf)*(sivcm/100)*rsal7 +
                    (rsal7>limit_siv*rsalmindf)*(sivcm/100)*limit_siv*rsalmindf +
                    (rsal7<=limit_siv*rsalmindf)*(siv/100)*rsal7 +
                    (rsal7>limit_siv*rsalmindf)*(siv/100)*limit_siv*rsalmindf +

                    (rsal7<=limit_srcv_r*rsalmindf)*(srcv_r/100)*rsal7 +
                    (rsal7>limit_srcv_r*rsalmindf)*(srcv_r/100)*limit_srcv_r*rsalmindf +

                    (rsal7<=limit_srcv_c*rsalmindf)*(srcv_c/100)*rsal7 +
                    (rsal7>limit_srcv_c*rsalmindf)*(srcv_c/100)*limit_srcv_c*rsalmindf +

                    (rsal7<=limit_sgps*rsalmindf)*(sgps/100)*rsal7 +
                    (rsal7>limit_sgps*rsalmindf)*(sgps/100)*limit_sgps*rsalmindf +

                    (rsal7<=limit_cpda*rsalmin)*(cpda/100)*rsal7 +
                    (rsal7>limit_cpda*rsalmin)*(cpda/100)*limit_cpda*rsalmin +

                    (rsal7<=limit_siv*rsalmindf)*(aport_esp/100)*rsal7 +
                    (rsal7>limit_siv*rsalmindf)*(aport_esp/100)*limit_siv*rsalmindf +

                    (rsal7<=limit_sem*rsalmindf)*(rsal7>=3*rsalmindf)*(extra/100)*(rsal7-3*rsalmindf) +
                    (rsal7>limit_sem*rsalmindf)*(rsal7>=3*rsalmindf)*(extra/100)*
                             (limit_sem*rsalmindf-3*rsalmindf) +
                    (sem_fija/100)*rsalmindf

                    );

  gen imss_worker_contrib_pct = (rimss_worker_contrib / rsal7)*100;

  * drop variables from contributions file;

  drop sem sem_d sivcm siv srcv_r srcv_c sgps aport_esp extra cpda
    sem_fija srt limit_siv limit_srcv_c limit_sem limit_srcv_r
    limit_sgps limit_srt limit_cpda;

  * calculate net wage, using contribution rates;
  gen rsal7net = rsal7*(1-(imss_emp_contrib_pct/100)-(imss_worker_contrib_pct/100));

  *** select on municipalities;

  *merge on metro area list created by eneu_metro_counts.do;
  sort entmpio_inegi;

  merge entmpio_inegi using ${work}eneu_metro_list, uniqusing;

  * list and discard municipalities that appear only in the metro list;
  tab _merge;
  tab entmpio_inegi if _m==2;
  drop if _m==2;
  drop _merge;

  * municipality 09001 is assigned by hand to metro area 1, in sample from 1987 Q1;
  replace metro_area = 1 if entmpio_inegi=="09001";
  replace first_year = 1987 if entmpio_inegi=="09001";
  replace first_qtr = 1 if entmpio_inegi=="09001";

  * branch on the loop macro rather than on the variable year: the if
    command would evaluate a variable in the first observation only;
  if `yr'>=1987
  {;
   * keep only obs that can be mapped to a metro area in the ENEU that has
     been in sample since 1987;

   keep if inlist(metro_area, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 20, 21, 22, 23);
   keep if (first_year<year) | (first_year==year & first_qtr<=qtr);
   tab metro_area, missing;
   drop first_year first_qtr last_year last_qtr;
  };

  * save version including IMSS employees;
  save ${work}imss_forhh_wpubsec_`yr'`month', replace;

   * save version not including IMSS employees;
  drop if sector==4;
  save ${work}imss_forhh_`yr'`month', replace;

 }; *end of loop over quarters;
}; *end of loop over years;


*********************************************;
************ prepare ENEU data **************;
*********************************************;

* Load the pooled ENEU file, apply the same sample restrictions and
  build the same harmonized wage, firm-size and sector variables as in
  the IMSS section, then save versions with and without sector 4 workers;

clear;
use ${work}eneu19872004_web.dta;

* select years;

keep if year>=1988 & year<=2003;

*only counting those who worked last week;
*Could include other individuals who answered work questions;
keep if work_lwk==1;
drop work_lwk;

* metro_area is re-attached from the metro list below;
drop metro_area;

sort entmpio_inegi;

merge entmpio_inegi using ${work}eneu_metro_list, uniqusing;

tab _merge;

* rows found only in the metro list carry no survey respondent. They
  have missing employee_type and would be removed by the employee_type
  filter below in any case -- drop them here explicitly, as the IMSS
  section does;
drop if _merge==2;
drop _merge;

* Include only metro areas that have been in ENEU since 1987;
keep if inlist(metro_area, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 20, 21, 22, 23);
keep if (first_year<year) | (first_year==year & first_qtr<=qtr);

** sample selection;

* keep workers who work for a fixed wage, work on commission or are
  members of a cooperative. Drop self-employed workers, owners, and
  unpaid workers;

keep if employee_type==4 | employee_type==5 | employee_type==6;

* age criterion;
keep if age>=16 & age<=65;

* drop if sex is missing;
keep if male==0 | male==1;

* drop individuals with missing salary;
keep if hsal~=. & sal~=.;

* impose a lower bound of 30 pesos (real terms) on reported wages (only applies pre-1991);
keep if rsal1 >= 30;

* create wage variable that has minimum topcode over period, but bottomcode same as rsal1;

* find topcode -- taken as the largest value of rsal4 in the pooled sample;
sum rsal4;
local topcode=r(max);

* Stata treats missing as larger than any number, so each topcode
  replace below excludes missing wages explicitly. Otherwise a missing
  wage would silently be turned into the topcode;
gen rsal7 = rsal1;
replace rsal7 = `topcode' if rsal7>`topcode' & !missing(rsal7);

* create wage variable that has bottomcode 1.045*MW and minimum topcode over period;

gen rsal8 = rsal1;
replace rsal8 = `topcode' if rsal8>`topcode' & !missing(rsal8);
replace rsal8 = 1.045*rsalmin if rsal8<1.045*rsalmin;

* create wage variable that has bottomcode of 30 pesos and minimum topcode over period;
* the conditions test rsal9 itself. Testing the already-capped rsal8
  (as an earlier version did) meant rsal9 was never topcoded;

gen rsal9 = rsal1;
replace rsal9 = `topcode' if rsal9>`topcode' & !missing(rsal9);
replace rsal9 = 30 if rsal9<30;

* create indicators for wage near minimum wage -- to get rough measure of stacking;
* rsal1 is a real wage, so it is compared with the real minimum wage
  rsalmin, matching the IMSS indicators imss_lt105mw and imss_lt110mw.
  NOTE(review): an earlier version compared against nominal salmin --
  confirm that the real comparison is the intended one;

gen byte eneu_lt105mw = 0;
replace eneu_lt105mw = 1 if rsal1<1.05*rsalmin;
gen byte eneu_lt110mw = 0;
replace eneu_lt110mw = 1 if rsal1<1.10*rsalmin;

* create indicators for permanent;
* equals 1 only when contract_type is 10 and work_imss is 1;
gen work_perm_imss = 0;
replace work_perm_imss = 1 if contract_type==10 & work_imss==1;

* create more aggregated firm size variables;
* same groupings as in the IMSS section;
gen firmsize2 = 0 if inlist(firmsize, 1, 2, 3, 4, 5, 6);
replace firmsize2 = 1 if inlist(firmsize,7,8);

gen firmsize3 = 1 if inlist(firmsize, 1, 2, 3);
replace firmsize3 = 2 if inlist(firmsize, 4, 5);
replace firmsize3 = 3 if inlist(firmsize,6);
replace firmsize3 = 4 if inlist(firmsize,7);
replace firmsize3 = 5 if inlist(firmsize,8);

* create aggregated sector variable;
* rama is numeric here, unlike the string rama_imss used for the IMSS data;
gen sector = .;
replace sector = 1 if inrange(rama,11,32) | inrange(rama,35,59);
replace sector = 2 if rama==60;
replace sector = 3 if inrange(rama,62,63);
replace sector = 4 if rama==73; * only includes subgrupo 7321;

* diagnostic tabulations of the selected sample;
tab esco1, missing;
tab work_imss, missing;
tab occ, missing;
tab age, missing;
tab age_cat, missing;
table age_cat2, missing;
tab married, missing;
tab metro_area, missing;
tab male, missing;
tab year, missing;
tab firmsize, missing;
tab sector, missing;

keep male age age_cat age_cat2 sal metro_area factor eneu subgrupo*
 last_job employee_type hrs_worked pay_type imss_d work_nopay temp_abs
 new_job work_lyr fulltime rama mnthwage mnthhour firmsize* qtr year
 entmpio_inegi rhsal* rsal* hsal sal relate_mw salmin salmindf zona
 cpi work work_all work_ft work_imss work_ft_imss work_perm_imss ind
 esco* occ married contract_type eneu_lt* sector unique_id qtr_in
 fe_flag;

sort year qtr age male ind metro_area;

* save version with imss employees;
save ${work}eneu_forimss_wpubsec_19882003, replace;

* save version without imss employees;
drop if sector==4;
save ${work}eneu_forimss_19882003, replace;

* no log is opened in this file -- presumably housekeeping.do opens one
  (TODO confirm). capture keeps the run from ending in an error if no
  log is open;
capture log close;

